Grace, Chanse, Carlene, Katelin, Vlad
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.7 ✔ dplyr 1.0.9
✔ tidyr 1.2.0 ✔ stringr 1.4.0
✔ readr 2.1.2 ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Warning message:
R graphics engine version 15 is not supported by this version of RStudio. The Plots tab will be disabled until a newer version of RStudio is installed.
library(readr)
#corporal_punishment <- read_csv("Corporal Punishment.csv")
#corporal_punishment <- corporal_punishment %>% filter(LEA_STATE == "CA") %>% select(LEAID,LEA_NAME,SCHID, SCH_NAME,COMBOKEY,SCH_CORPINSTANCES_IND,SCH_CORPINSTANCES_WDIS,SCH_CORPINSTANCES_WODIS)
#corporal_punishment <- na_if(corporal_punishment,-9)
#corporal_punishment <- corporal_punishment %>% drop_na()
#head(corporal_punishment,15)
#summary(corporal_punishment)
credit_recovery <- read_csv("Credit Recovery.csv")
Rows: 97632 Columns: 10── Column specification ───────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (9): LEA_STATE, LEA_STATE_NAME, LEAID, LEA_NAME, SCHID, SCH_NAME, COMBOKEY, JJ, SCH_CREDITRECOV...
dbl (1): SCH_CREDITRECOVERYENR
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
credit_recovery <- credit_recovery %>% filter(LEA_STATE == "CA") %>% na_if(-9) %>% drop_na()
head(credit_recovery,15)
summary(credit_recovery)
LEA_STATE LEA_STATE_NAME LEAID LEA_NAME SCHID
Length:1763 Length:1763 Length:1763 Length:1763 Length:1763
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
SCH_NAME COMBOKEY JJ SCH_CREDITRECOVERY_IND SCH_CREDITRECOVERYENR
Length:1763 Length:1763 Length:1763 Length:1763 Min. : 0
Class :character Class :character Class :character Class :character 1st Qu.: 20
Mode :character Mode :character Mode :character Mode :character Median : 74
Mean : 198
3rd Qu.: 219
Max. :12074
d_enrollment <- read_csv("Dual Enrollment.csv")
Rows: 97632 Columns: 29── Column specification ───────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (9): LEA_STATE, LEA_STATE_NAME, LEAID, LEA_NAME, SCHID, SCH_NAME, COMBOKEY, JJ, SCH_DUAL_IND
dbl (20): SCH_DUALENR_HI_M, SCH_DUALENR_HI_F, SCH_DUALENR_AM_M, SCH_DUALENR_AM_F, SCH_DUALENR_AS_M,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d_enrollment <- d_enrollment %>% filter(LEA_STATE == "CA") %>% na_if(-9) %>% drop_na() %>% select(LEAID,LEA_NAME,SCHID, SCH_NAME,COMBOKEY,SCH_DUAL_IND,TOT_DUAL_F,TOT_DUAL_M,SCH_DUALENR_LEP_F,SCH_DUALENR_LEP_M,SCH_DUALENR_IDEA_F,SCH_DUALENR_IDEA_M)
#head(d_enrollment,15)
#summary(d_enrollment)
offenses <- read_csv("Offenses.csv")
Rows: 97632 Columns: 22── Column specification ───────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (10): LEA_STATE, LEA_STATE_NAME, LEAID, LEA_NAME, SCHID, SCH_NAME, COMBOKEY, JJ, SCH_FIREARM_IN...
dbl (12): SCH_OFFENSE_RAPE, SCH_OFFENSE_BATT, SCH_OFFENSE_ROBWW, SCH_OFFENSE_ROBWX, SCH_OFFENSE_ROB...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
offenses <- offenses %>% filter(LEA_STATE == "CA")
offenses <- na_if(offenses,-9)
head(offenses,10)
NA
g_t <- read_csv("Gifted and Talented.csv")
Rows: 97632 Columns: 29── Column specification ───────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (9): LEA_STATE, LEA_STATE_NAME, LEAID, LEA_NAME, SCHID, SCH_NAME, COMBOKEY, JJ, SCH_GT_IND
dbl (20): SCH_GTENR_HI_M, SCH_GTENR_HI_F, SCH_GTENR_AM_M, SCH_GTENR_AM_F, SCH_GTENR_AS_M, SCH_GTENR...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
g_t <- g_t %>% filter(LEA_STATE == "CA") %>% na_if(-9) %>% drop_na() %>% select(LEAID,LEA_NAME,SCHID, SCH_NAME,COMBOKEY,SCH_GT_IND,SCH_GTENR_LEP_M,SCH_GTENR_LEP_F,SCH_GTENR_IDEA_M,SCH_GTENR_IDEA_F)
summary(g_t)
LEAID LEA_NAME SCHID SCH_NAME COMBOKEY
Length:5479 Length:5479 Length:5479 Length:5479 Length:5479
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
SCH_GT_IND SCH_GTENR_LEP_M SCH_GTENR_LEP_F SCH_GTENR_IDEA_M SCH_GTENR_IDEA_F
Length:5479 Min. : 0.0 Min. : 0.0 Min. : 0 Min. : 0.00
Class :character 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.00
Mode :character Median : 0.0 Median : 0.0 Median : 0 Median : 0.00
Mean : 0.9 Mean : 0.7 Mean : 1 Mean : 0.37
3rd Qu.: 1.0 3rd Qu.: 0.0 3rd Qu.: 1 3rd Qu.: 0.00
Max. :50.0 Max. :37.0 Max. :71 Max. :30.00
head(g_t,15)
# load in SAT participation here
SAT_ACT <- read_csv("SAT and ACT.csv")
IB <- read_csv("International Baccalaureate.csv")
Rows: 97632 Columns: 29── Column specification ───────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (9): LEA_STATE, LEA_STATE_NAME, LEAID, LEA_NAME, SCHID, SCH_NAME, COMBOKEY, JJ, SCH_IBENR_IND
dbl (20): SCH_IBENR_HI_M, SCH_IBENR_HI_F, SCH_IBENR_AM_M, SCH_IBENR_AM_F, SCH_IBENR_AS_M, SCH_IBENR...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IB <- IB %>% filter(LEA_STATE == "CA") %>% na_if(-9) %>% drop_na() %>% select(LEAID,LEA_NAME,SCHID, SCH_NAME,COMBOKEY,SCH_IBENR_IND,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F)
summary(IB)
LEAID LEA_NAME SCHID SCH_NAME COMBOKEY
Length:97 Length:97 Length:97 Length:97 Length:97
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
SCH_IBENR_IND SCH_IBENR_LEP_M SCH_IBENR_LEP_F SCH_IBENR_IDEA_M SCH_IBENR_IDEA_F
Length:97 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
Class :character 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
Mode :character Median : 0.0 Median : 1.0 Median : 1.0 Median : 1.0
Mean : 3.5 Mean : 3.7 Mean : 3.3 Mean : 2.4
3rd Qu.: 2.0 3rd Qu.: 3.0 3rd Qu.: 3.0 3rd Qu.: 2.0
Max. :79.0 Max. :65.0 Max. :92.0 Max. :53.0
head(IB,15)
#load in school characteristics
characteristics <- read_csv("School Characteristics.csv")
expenditures <- read_csv("School Expenditures.csv")
Rows: 97632 Columns: 27── Column specification ───────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): LEA_STATE, LEA_STATE_NAME, LEAID, LEA_NAME, SCHID, SCH_NAME, COMBOKEY, JJ
dbl (19): SCH_SAL_TOTPERS_WOFED, SCH_FTE_TEACH_WOFED, SCH_SAL_TEACH_WOFED, SCH_NPE_WOFED, SCH_FTE_A...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
expenditures <- expenditures %>% filter(LEA_STATE == "CA")
head(expenditures)
expenditure_model <- lm(SCH_FTE_AID_WOFED ~ SCH_FTE_SUP_WOFED,
data = expenditures,
family = binomial)
In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument ‘family’ will be disregarded
summary(expenditure_model)
Call:
lm(formula = SCH_FTE_AID_WOFED ~ SCH_FTE_SUP_WOFED, data = expenditures,
family = binomial)
Residuals:
Min 1Q Median 3Q Max
-906 -5 -4 -2 27726
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.571272 2.745465 1.3 0.19
SCH_FTE_SUP_WOFED 0.465533 0.000914 509.1 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 276 on 10119 degrees of freedom
Multiple R-squared: 0.962, Adjusted R-squared: 0.962
F-statistic: 2.59e+05 on 1 and 10119 DF, p-value: <0.0000000000000002
college_going = read.delim("collegegoing.txt", na.strings=c("","*", "NA"))
# Chanse's exploration
college_going_1 <- college_going %>%
mutate_at(c(13:20), as.numeric) %>%
filter(!is.na(College.Going.Rate...Total..12.Months.)) %>%
mutate(CharterSchool = (CharterSchool == "Yes"))
charter_schools <- college_going_1 %>% filter(CharterSchool == TRUE)
noncharter_schools <- college_going_1 %>% filter(CharterSchool == FALSE)
mean(charter_schools$College.Going.Rate...Total..12.Months.)
[1] 49.76593
mean(noncharter_schools$College.Going.Rate...Total..12.Months.)
[1] 56.9517
# Grace clean up college_going file
college_going <- college_going %>%
select(CountyCode, DistrictCode, SchoolCode, DistrictName, SchoolName, CharterSchool, High.School.Completers, Enrolled.In.College...Total..12.Months., College.Going.Rate...Total..12.Months., Enrolled.In.State..12.Months.,Enrolled.Out.of.State..12.Months.) %>%
rename(Total_Enrolled_College = Enrolled.In.College...Total..12.Months., Total_College_Going_Rate = College.Going.Rate...Total..12.Months., Total_Enrolled_In_State = Enrolled.In.State..12.Months., Total_Enrolled_Out_State = Enrolled.Out.of.State..12.Months.)
#463797 rows
college_going <- na.omit(college_going)
college_going
# Grace load in graduation rate
grad_rate = read.delim("graduationrate.txt", na.strings=c("","*", "NA"))
grad_rate <- grad_rate %>%
select(CountyCode, DistrictCode, SchoolCode, DistrictName, SchoolName, CharterSchool, CohortStudents, Regular.HS.Diploma.Graduates..Count.,Regular.HS.Diploma.Graduates..Rate.) %>% rename(Num_Students = CohortStudents, HS_Diploma_Count = Regular.HS.Diploma.Graduates..Count., HS_Diploma_Rate = Regular.HS.Diploma.Graduates..Rate.)
grad_rate <- na.omit(grad_rate)
grad_rate
# use inner join to merge all the cleaned up data files
# run regression using all the covariates